{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64974d33-d028-440c-86fa-1a0633b3d31d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "# =============================================================================="
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3f0ff46-9958-4d57-9067-a64be34e75da",
   "metadata": {},
   "source": [
    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
    "\n",
    "# T5 Playground\n",
    "\n",
    "This notebook demonstrates the T5 model on the tasks of translation and text summarization.\n",
    "\n",
    "The TensorRT HuggingFace T5 model is a drop-in replacement for the original PyTorch HuggingFace T5 model.\n",
    "\n",
    "\n",
    "\n",
    "**Notes**: \n",
    " - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a T5 small model from the HuggingFace model repository is employed. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n",
    "   The average run time across 5 runs is reported (one additional warm-up run is executed first and excluded from the average).\n",
    " - Prior to running this notebook, run [t5.ipynb](t5.ipynb) to download the T5 model and generate the TensorRT engine."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3530e767-7050-4329-a4bc-e2221b9eb578",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "ROOT_DIR = os.path.abspath(\"../\")\n",
    "sys.path.append(ROOT_DIR)\n",
    "\n",
    "import torch \n",
    "\n",
    "# huggingface\n",
    "from transformers import (\n",
    "    T5ForConditionalGeneration,\n",
    "    T5Tokenizer,\n",
    "    T5Config,\n",
    ")\n",
    "\n",
    "# download HuggingFace model and tokenizer\n",
    "T5_VARIANT = 't5-small'\n",
    "\n",
    "t5_model = T5ForConditionalGeneration.from_pretrained(T5_VARIANT)\n",
    "tokenizer = T5Tokenizer.from_pretrained(T5_VARIANT)\n",
    "# T5Config's first positional parameter is vocab_size, so T5Config(T5_VARIANT)\n",
    "# would not load this variant's settings; from_pretrained() does.\n",
    "config = T5Config.from_pretrained(T5_VARIANT)\n",
    "\n",
    "# load TensorRT engine\n",
    "from T5.trt import T5TRTEncoder, T5TRTDecoder, TRTHFRunner\n",
    "from T5.T5ModelConfig import T5ModelTRTConfig\n",
    "from T5.export import T5DecoderTRTEngine, T5EncoderTRTEngine\n",
    "from NNDF.networks import NetworkMetadata, Precision\n",
    "\n",
    "from transformers.generation_stopping_criteria import (\n",
    "    MaxLengthCriteria,\n",
    "    StoppingCriteriaList,\n",
    ")\n",
    "\n",
    "# Configuration passed to both TensorRT model wrappers created further down in this cell.\n",
    "tfm_config = T5Config(use_cache=True, num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[T5_VARIANT])\n",
    "\n",
    "# NOTE(review): Precision('fp16') passes the string positionally - confirm this is how FP16 is meant to be requested.\n",
    "metadata = NetworkMetadata(T5_VARIANT, Precision('fp16'), None)\n",
    "\n",
    "from os.path import exists\n",
    "# Paths of the TensorRT engines that t5.ipynb is expected to have generated.\n",
    "engine_dir = './models/{}/tensorrt'.format(T5_VARIANT)\n",
    "encoder_path = '{}/{}-encoder.onnx.engine'.format(engine_dir, T5_VARIANT)\n",
    "decoder_path = '{}/{}-decoder-with-lm-head.onnx.engine'.format(engine_dir, T5_VARIANT)\n",
    "\n",
    "# Stop here with an actionable error; continuing without the engines would only\n",
    "# fail later with a NameError on encoder_engine / decoder_engine.\n",
    "for engine_path in (encoder_path, decoder_path):\n",
    "    if not exists(engine_path):\n",
    "        raise FileNotFoundError(\n",
    "            \"Error: TensorRT engine not found at {}. Please run t5.ipynb to generate the TensorRT engine first!\".format(engine_path)\n",
    "        )\n",
    "\n",
    "# Each engine file is wrapped in its matching engine class.\n",
    "encoder_engine = T5EncoderTRTEngine(encoder_path, metadata)\n",
    "decoder_engine = T5DecoderTRTEngine(decoder_path, metadata)\n",
    "\n",
    "t5_trt_encoder = T5TRTEncoder(encoder_engine, metadata, tfm_config)\n",
    "t5_trt_decoder = T5TRTDecoder(decoder_engine, metadata, tfm_config)\n",
    "\n",
    "# Initial decoder input: a (1, 1) int32 tensor holding the tokenizer's pad token id.\n",
    "decoder_input_ids = torch.full(\n",
    "    size=(1, 1),\n",
    "    fill_value=tokenizer.convert_tokens_to_ids(tokenizer.pad_token),\n",
    "    dtype=torch.int32,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "766b8c94-ba8e-47c8-8624-57da462a0496",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import ipywidgets as widgets\n",
    "import numpy as np\n",
    "import time\n",
    "\n",
    "# Selector for the inference backend.\n",
    "device = widgets.RadioButtons(\n",
    "    description='Device:',\n",
    "    options=['CPU - PyTorch', 'GPU - PyTorch', 'GPU - TensorRT'],\n",
    "    disabled=False,\n",
    ")\n",
    "\n",
    "# Selector for the text task.\n",
    "task = widgets.RadioButtons(\n",
    "    description='Task:',\n",
    "    options=['En -> German', 'Summarize'],\n",
    "    disabled=False,\n",
    ")\n",
    "\n",
    "# Input box pre-filled with a sample passage. Every fragment except the last ends\n",
    "# with a space so the implicitly concatenated literals do not run words together.\n",
    "paragraph_text = widgets.Textarea(\n",
    "    value=(\n",
    "        'TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\n",
    "        'such as recommenders, speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops '\n",
    "        'and layers before applying optimizations for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep '\n",
    "        'learning community can customize and extend these components to take advantage of powerful TensorRT optimizations for your apps.'\n",
    "    ),\n",
    "    placeholder='Type something',\n",
    "    description='Context:',\n",
    "    disabled=False,\n",
    "    layout=widgets.Layout(width=\"auto\"),\n",
    "    rows=5,\n",
    ")\n",
    "\n",
    "\n",
    "# Text area whose value is overwritten with the decoded model output.\n",
    "generated_text = widgets.Textarea(\n",
    "    description='T5 output:',\n",
    "    value='...',\n",
    "    placeholder='Context',\n",
    "    rows=5,\n",
    "    layout=widgets.Layout(width=\"auto\"),\n",
    "    disabled=False,\n",
    ")\n",
    "\n",
    "# Button that triggers a generation run once a click handler is registered on it.\n",
    "button = widgets.Button(description=\"Generate\")\n",
    "\n",
    "# Import display before its first use instead of relying on IPython's injected builtin.\n",
    "from IPython.display import display\n",
    "\n",
    "# Inference runs per button click; generate() leaves the first one out of the average.\n",
    "N_RUN = 6\n",
    "\n",
    "box_layout = widgets.Layout(\n",
    "    display='flex',\n",
    "    flex_flow='column',\n",
    "    align_items='center',\n",
    "    width='100%',\n",
    ")\n",
    "box = widgets.HBox(children=[button], layout=box_layout)\n",
    "\n",
    "# One progress step per inference run.\n",
    "progress_bar = widgets.IntProgress(\n",
    "    value=0,\n",
    "    min=0,\n",
    "    max=N_RUN,\n",
    "    description='Progress:',\n",
    "    bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
    "    style={'bar_color': 'green'},\n",
    "    orientation='horizontal',\n",
    "    layout=widgets.Layout(width='100%', height='50px'),\n",
    ")\n",
    "\n",
    "# Captures whatever generate() prints while it runs inside `with output:`.\n",
    "output = widgets.Output()\n",
    "\n",
    "# Render the UI top to bottom: input, result, selectors, button, progress bar, log output.\n",
    "display(paragraph_text, generated_text, device, task, box, progress_bar, output)\n",
    "\n",
    "# Upper bound on the generated sequence length for every backend.\n",
    "MAX_LENGTH = 256\n",
    "\n",
    "# Maps each PyTorch choice of the device selector to its torch device string.\n",
    "_TORCH_DEVICES = {'CPU - PyTorch': 'cpu', 'GPU - PyTorch': 'cuda:0'}\n",
    "\n",
    "\n",
    "def _timed_runs(run_once):\n",
    "    \"\"\"Call run_once() N_RUN times, timing each call and advancing the progress bar.\n",
    "\n",
    "    Returns (result of the last call, list of per-call durations in seconds).\n",
    "    \"\"\"\n",
    "    durations = []\n",
    "    result = None\n",
    "    for _ in range(N_RUN):\n",
    "        # perf_counter is a monotonic, high-resolution clock intended for measuring intervals.\n",
    "        tic = time.perf_counter()\n",
    "        result = run_once()\n",
    "        durations.append(time.perf_counter() - tic)\n",
    "        progress_bar.value += 1\n",
    "    return result, durations\n",
    "\n",
    "\n",
    "def _run_tensorrt(input_ids):\n",
    "    \"\"\"Run the TensorRT encoder on input_ids, then greedy-search decode; returns the decoder output.\"\"\"\n",
    "    encoder_last_hidden_state = t5_trt_encoder(input_ids=input_ids)\n",
    "    return t5_trt_decoder.greedy_search(\n",
    "        input_ids=decoder_input_ids,\n",
    "        encoder_hidden_states=encoder_last_hidden_state,\n",
    "        stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(MAX_LENGTH)]),\n",
    "    )\n",
    "\n",
    "\n",
    "def generate(b):\n",
    "    \"\"\"Click handler: run the selected task on the selected backend and report the timing.\n",
    "\n",
    "    Tokenizes the task prefix plus the text in the input box, runs inference N_RUN times,\n",
    "    writes the decoded result of the last run into the output text area and prints the\n",
    "    mean latency of all runs except the first, which serves as warm-up.\n",
    "\n",
    "    Args:\n",
    "        b: value passed by button.on_click (unused).\n",
    "    \"\"\"\n",
    "    progress_bar.value = 0\n",
    "    prefix = 'translate English to German' if task.value=='En -> German' else 'summarize'\n",
    "    inputs = tokenizer(\"{}: {}\".format(prefix, paragraph_text.value), return_tensors=\"pt\")\n",
    "    backend = device.value\n",
    "    with output:\n",
    "        if backend == 'GPU - TensorRT':\n",
    "            def run_once():\n",
    "                return _run_tensorrt(inputs.input_ids)\n",
    "        elif backend in _TORCH_DEVICES:\n",
    "            torch_device = _TORCH_DEVICES[backend]\n",
    "            # Move the model once, outside the timed loop, so the move is never measured.\n",
    "            model = t5_model.to(torch_device)\n",
    "\n",
    "            def run_once():\n",
    "                return model.generate(inputs.input_ids.to(torch_device), max_length=MAX_LENGTH)\n",
    "        else:\n",
    "            # Unknown selection: nothing to run.\n",
    "            return\n",
    "\n",
    "        outputs, durations = _timed_runs(run_once)\n",
    "\n",
    "        # de-tokenize model output to raw text\n",
    "        generated_text.value = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
    "        # The first run is excluded from the average as warm-up.\n",
    "        print(\"%s - Average inference time: %.2f (ms)\" % (backend, 1000 * np.mean(durations[1:])))\n",
    "\n",
    "\n",
    "button.on_click(generate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58f473c0-6682-41af-8040-72f0a9472b0f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
